# Sentiment scoring is very slow, so its results were saved to a separate data
# file ahead of time; load that file here instead of re-running the scoring.
us_tweets <- read_csv(file = "us_tweets.csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_integer(),
## tweet_id = col_double(),
## date = col_date(format = ""),
## hour = col_time(format = ""),
## user_name = col_character(),
## nickname = col_character(),
## bio = col_character(),
## tweet_content = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## country = col_character(),
## place_as_appears_on_bio = col_character(),
## profile_picture = col_character(),
## tweet_url = col_character()
## )
## See spec(...) for full column specifications.
# Build a cleaned copy of the tweet text in two passes:
#   1. inner gsub(): delete every character that is neither alphanumeric nor a
#      space;
#   2. outer gsub(): replace each stand-alone word of 1-2 letters (together
#      with any spaces around it) by a single space.
us_tweets$tweet_content_stripped <- gsub(
  pattern = " *\\b[[:alpha:]]{1,2}\\b *",
  replacement = " ",
  x = gsub(
    pattern = "[^[:alnum:] ]",
    replacement = "",
    x = us_tweets$tweet_content
  )
)
# Total score per sentiment across all tweets.
# The sentiment columns are selected by name rather than by position (20:27):
# positional indices silently pick the wrong columns as soon as a column is
# added, dropped or reordered (e.g. if the auto-generated X1 column goes away).
sentiment_cols <- c(
  "anger", "anticipation", "disgust", "fear",
  "joy", "sadness", "surprise", "trust"
)
# colSums() returns a named vector, so the sentiment names become row names.
sentimentTotals <- data.frame(count = colSums(us_tweets[, sentiment_cols]))
# Expose the sentiment names as a regular column too, so they can be mapped to
# a plot aesthetic.
sentimentTotals <- cbind(
  "sentiment" = rownames(sentimentTotals),
  sentimentTotals
)
sentimentTotals
## sentiment count
## anger anger 13605
## anticipation anticipation 52960
## disgust disgust 12668
## fear fear 19942
## joy joy 46690
## sadness sadness 21882
## surprise surprise 22067
## trust trust 76347
# Convert the time-of-day column to POSIXct so it can be drawn on a datetime
# x axis, then plot the number of tweets at each distinct time value.
us_tweets$hour <- as.POSIXct(us_tweets$hour, format = " %H:%M")
ggplot(data = us_tweets, aes(x = hour)) +
  # geom_bar() counts the rows at each x value. It replaces
  # geom_histogram(stat = "count"), which draws the same bars but warns that
  # its binning parameters are being ignored.
  geom_bar() +
  # The bars are raw counts, not proportions, so label the axis accordingly.
  xlab("Time") + ylab("Number of tweets") +
  ggtitle("Number of Tweets per Hour") +
  # Show only hours and minutes on the axis (the date part is meaningless).
  scale_x_datetime(labels = date_format("%H:%M"))
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# Length of every tweet in characters. nchar() is already vectorised, so no
# sapply() loop is needed; this also avoids sapply() attaching the full tweet
# text to every element as a name.
us_tweets$charsintweet <- nchar(us_tweets$tweet_content)

# Histogram of tweet lengths in 8-character bins; bars are shaded by their
# height and the legend for that shading is hidden.
ggplot(data = us_tweets, aes(x = charsintweet)) +
  geom_histogram(aes(fill = ..count..), binwidth = 8) +
  theme(legend.position = "none") +
  xlab("Characters per Tweet") +
  ylab("Number of tweets") +
  scale_fill_gradient(low = "midnightblue", high = "aquamarine4") +
  # Tweets outside 0-150 characters are dropped from the plot (with a warning).
  xlim(0, 150) +
  ggtitle("Characters per Tweet")
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

# Bar chart of the summed score for each sentiment, one colour per sentiment.
# The fill legend is hidden because it would only repeat the x-axis labels.
ggplot(sentimentTotals, aes(x = sentiment, y = count, fill = sentiment)) +
  geom_bar(stat = "identity") +
  labs(
    x = "Sentiment",
    y = "Total Count",
    title = "Total Sentiment Score for All Tweets in Sample"
  ) +
  theme(legend.position = "none")

# Split the cleaned tweet text into one row per word (all other columns of
# us_tweets are carried along), then drop common stop words.
tweet_words <- us_tweets %>%
  unnest_tokens(word, tweet_content_stripped)
data(stop_words)
# Join explicitly on `word`: an implicit natural join would silently change
# its key if the two tables ever came to share another column name.
tweet_words <-
  anti_join(tweet_words, stop_words, by = "word")
## Joining, by = "word"
# Word cloud of the most frequent non-stop-words in the tweets.
tweet_words %>%
  count(word) %>%
  with(wordcloud(word, n,
    max.words = 200,       # plot at most 200 words
    random.order = FALSE,  # place words in decreasing order of frequency
    rot.per = 0.35,        # share of words drawn rotated
    # brewer.pal() supports a minimum of 3 colours; asking for 2 returned the
    # 3-colour palette anyway, with a warning. Request 3 directly.
    colors = brewer.pal(3, "Dark2")
  ))
## Warning in brewer.pal(2, "Dark2"): minimal value for n is 3, returning requested palette with 3 different levels

# 8-colour "Dark2" palette. NOTE(review): not referenced anywhere in the
# visible part of the file -- confirm whether later code uses it.
pal2 <- brewer.pal(8, "Dark2")

# Horizontal bar chart of the ten most frequent words.
tweet_words %>%
  count(word, sort = TRUE) %>%
  # Rank explicitly by the count column instead of relying on top_n()'s
  # implicit "last column" default.
  top_n(10, n) %>%
  # Order the factor levels by frequency so the bars appear sorted.
  mutate(word = fct_reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_bar(stat = "identity", fill = "blue", alpha = .6) +
  coord_flip()
## Selecting by n

# Frequency table of hashtags used in the tweets.
# Pull out every "#..." token (up to the next whitespace) from all tweets and
# flatten the per-tweet results into a single character vector.
hashtags <- unlist(str_extract_all(us_tweets$tweet_content, "#\\S+"))
# Normalise: lower-case, then strip the leading "#" and any other characters
# that are neither alphanumeric nor a space (e.g. trailing punctuation).
hashtags <- gsub("[^[:alnum:] ]", "", tolower(hashtags))

# Count occurrences and sort from most to least frequent.
hashtag.df <- data.frame(table(hashtags))
hashtag.df$hashtags <- as.character(hashtag.df$hashtags)
# table() counts are already integers, so no as.character() round trip is
# needed before converting to numeric.
hashtag.df$Freq <- as.numeric(hashtag.df$Freq)
hashtag.df <- arrange(hashtag.df, desc(Freq))
# head() shows at most 20 rows and, unlike hashtag.df[1:20, ], does not pad
# the output with NA rows when there are fewer than 20 distinct hashtags.
print(head(hashtag.df, 20))
## hashtags Freq
## 1 job 51511
## 2 hiring 45428
## 3 jobs 21910
## 4 careerarc 20717
## 5 retail 7454
## 6 hospitality 7311
## 7 nursing 5091
## 8 healthcare 4702
## 9 veterans 4471
## 10 sales 3310
## 11 it 2179
## 12 customerservice 1927
## 13 transportation 1568
## 14 sonic 1520
## 15 manufacturing 1476
## 16 photo 1432
## 17 businessmgmt 1348
## 18 accounting 1053
## 19 engineering 970
## 20 traffic 955
# Interactive scatter plot of US tweets by longitude/latitude. Markers are
# coloured by follower count and carry a hover label built from the follower
# count and the location string taken from the user's bio.
us_tweets %>%
  filter(country == "US") %>%
  mutate(
    text_label = str_c(
      "followers: ", followers,
      "\nlocation: ", place_as_appears_on_bio
    )
  ) %>%
  plot_ly(
    x = ~longitude,
    y = ~latitude,
    color = ~followers,
    text = ~text_label,
    type = "scatter",
    mode = "markers",
    alpha = 0.5
  )